# Author: Stephen Situ
# This is a project that explores the different ways data can be cleaned and pre processed for machine learning. The first method
# is just sequentially cleaning the data manually one step at a time. This is good for infrequent use. The next method is to 
# define a function and pass the dataframe into the function. The last and most preferred method is to create a pipeline that
# can utilize the scikit learn pipeline library. It sequentially applies a list of transformations and a final estimator.
# They must implement fit and transform methods. The final estimator only needs to implement fit.


# Import pandas and Numpy
import pandas as pd
import numpy as np


# Read CSV
df = pd.read_csv('cars_raw.csv')


# Head
df.head()


# Describe
df.describe()


# data types
df.dtypes

Year                       int64
Make                      object
Model                     object
Used/New                  object
Price                     object
ConsumerRating           float64
ConsumerReviews            int64
SellerType                object
SellerName                object
SellerRating             float64
SellerReviews              int64
StreetName                object
State                     object
Zipcode                   object
DealType                  object
ComfortRating            float64
InteriorDesignRating     float64
PerformanceRating        float64
ValueForMoneyRating      float64
ExteriorStylingRating    float64
ReliabilityRating        float64
ExteriorColor             object
InteriorColor             object
Drivetrain                object
MinMPG                     int64
MaxMPG                     int64
FuelType                  object
Transmission              object
Engine                    object
VIN                       object
Stock#                    object
Mileage                    int64
dtype: object


# drop columns
df = df.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)


# Head
df.head()


# delete $ from string "Price"
# delete "Not Priced"
df["Price"] = df["Price"].str.replace("$","")
df["Price"] = df["Price"].str.replace(",","")
df["Price"] = df["Price"].str.replace("Not Priced","")

C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\3117742891.py:3: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  df["Price"] = df["Price"].str.replace("$","")


# Cast Price to numeric
df["Price"] = pd.to_numeric(df["Price"])


# Impute price as mean
df['Price'].fillna((df['Price'].mean()), inplace=True)


# Hot encode Categirucal features
df = pd.get_dummies(df)
df.to_csv("clean_normal.csv",index=False)


# Head
df.head

<bound method NDFrame.head of       Year    Price  ConsumerRating  ConsumerReviews  SellerRating  \
0     2019  39998.0             4.6               45           3.3   
1     2018  49985.0             4.8              817           4.8   
2     2017  41860.0             4.7              495           4.6   
3     2021  28500.0             5.0               36           4.6   
4     2020  49000.0             4.8               76           4.8   
...    ...      ...             ...              ...           ...   
9374  2019  27374.0             4.7              205           4.4   
9375  2019  61998.0             4.8               27           4.8   
9376  2017  26944.0             4.8              137           4.7   
9377  2019  28568.0             4.7              279           4.4   
9378  2019  32091.0             4.8              204           4.4   

      SellerReviews  ComfortRating  InteriorDesignRating  PerformanceRating  \
0                 3            4.7                   4.6                4.6   
1               131            4.9                   4.8                4.8   
2               249            4.8                   4.7                4.8   
3               284            4.9                   5.0                4.9   
4              4755            4.9                   4.8                4.8   
...             ...            ...                   ...                ...   
9374            443            4.7                   4.7                4.6   
9375           1789            4.9                   4.8                4.8   
9376            831            4.9                   4.8                4.7   
9377            680            4.8                   4.7                4.6   
9378           1105            4.9                   4.9                4.6   

      ValueForMoneyRating  ...  State_TX  State_US-12  State_US-169  State_UT  \
0                     4.4  ...         0            0             0         0   
1                     4.6  ...         0            0             0         0   
2                     4.6  ...         0            0             0         0   
3                     5.0  ...         0            0             0         0   
4                     4.7  ...         0            0             0         0   
...                   ...  ...       ...          ...           ...       ...   
9374                  4.7  ...         0            0             0         0   
9375                  4.6  ...         0            0             0         0   
9376                  4.6  ...         0            0             0         0   
9377                  4.7  ...         0            0             0         0   
9378                  4.8  ...         0            0             0         0   

      State_VA  State_VT  State_WA  State_WI  State_WV  State_WY  
0            0         0         0         0         0         0  
1            0         0         0         0         0         0  
2            0         0         0         0         0         0  
3            0         0         0         0         0         0  
4            0         0         0         0         0         0  
...        ...       ...       ...       ...       ...       ...  
9374         0         0         0         0         0         0  
9375         0         0         0         0         0         0  
9376         0         0         0         0         0         0  
9377         0         0         0         0         0         0  
9378         0         0         0         0         0         0  

[9379 rows x 117 columns]>


# Create a function to preform the cleaning and transformation task and test it
def clean(X):
    X = X.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
    X["Price"] = X["Price"].str.replace("$","")
    X["Price"] = X["Price"].str.replace(",","")
    X["Price"] = X["Price"].str.replace("Not Priced","")
    X["Price"] = pd.to_numeric(X["Price"])
    X['Price'].fillna((X['Price'].mean()), inplace=True)
    X = pd.get_dummies(X)
    X.to_csv("clean_function.csv",index=False)  
    return(X)


# Should get the same result as before
df2 = pd.read_csv('cars_raw.csv')
clean(df2)

C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\2668432355.py:4: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  X["Price"] = X["Price"].str.replace("$","")


# Using sci-kit learn we can create a pipeline that uses estimators and transformers
from sklearn.base import BaseEstimator, TransformerMixin

class Columndropper(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self, X):
        return X.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
    
class PriceCleaner(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self, X):  
        X["Price"] = X["Price"].str.replace("$","")
        X["Price"] = X["Price"].str.replace(",","")
        X["Price"] = X["Price"].str.replace("Not Priced","")
        return X
    
class CastNumeric(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self    
    def transform(self, X):
        X["Price"] = pd.to_numeric(X["Price"])
        return X
    
class Imputer(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self 
    def transform(self, X):   
        X['Price'].fillna((X['Price'].mean()), inplace=True)
        return X
    
class HotEncode(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self 
    def transform(self, X):     
        X = pd.get_dummies(X)
        return X


# Use a new dataframe, df3
df3 = pd.read_csv('cars_raw.csv')


# Call the functions
# now we can use classes/methods/attributes to do the transformation
column_drop = Columndropper()
price_cleaner = PriceCleaner()
cast_numeric = CastNumeric()
imputer = Imputer()
hot_encode = HotEncode()
df_clean = hot_encode.transform(imputer.transform(cast_numeric.transform(price_cleaner.transform(column_drop.transform(df3)))))
df_clean.head()

C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\455890638.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  X["Price"] = X["Price"].str.replace("$","")


df4 = pd.read_csv('cars_raw.csv')


# Define pipe as a list of tuples 
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("column_drop", Columndropper()),
    ("price_cleaner",  PriceCleaner()),
    ("cast_numeric", CastNumeric()),
    ("imputer",Imputer()),
    ("hot_encode",HotEncode())
])


# Use fit_transform and get the same result
df_pipe_cleaned = pipe.fit_transform(df4)
df_pipe_cleaned.to_csv("clean_pipeline.csv",index=False)  
df_pipe_cleaned.head()

C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\455890638.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  X["Price"] = X["Price"].str.replace("$","")

	Year	Make	Model	Used/New	Price	ConsumerRating	ConsumerReviews	SellerType	SellerName	SellerRating	...	InteriorColor	Drivetrain	MinMPG	MaxMPG	FuelType	Transmission	Engine	VIN	Stock#	Mileage
0	2019	Toyota	Sienna SE	Used	$39,998	4.6	45	Dealer	CarMax Murrieta - Now offering Curbside Pickup...	3.3	...	Black	Front-wheel Drive	19	27	Gasoline	8-Speed Automatic	3.5L V6 24V PDI DOHC	5TDXZ3DC2KS015402	22998646	29403
1	2018	Ford	F-150 Lariat	Used	$49,985	4.8	817	Dealer	Giant Chevrolet	4.8	...	Black	Four-wheel Drive	19	24	Gasoline	10-Speed Automatic	3.5L V6 24V PDI DOHC Twin Turbo	1FTEW1EG2JFD44217	22418A	32929
2	2017	RAM	1500 Laramie	Used	$41,860	4.7	495	Dealer	Gill Auto Group Madera	4.6	...	Black	Four-wheel Drive	15	21	Gasoline	8-Speed Automatic	5.7L V8 16V MPFI OHV	1C6RR7VT5HS842283	NG277871G	23173
3	2021	Honda	Accord Sport SE	Used	$28,500	5.0	36	Dealer	AutoSavvy Las Vegas	4.6	...	–	Front-wheel Drive	29	35	Gasoline	Automatic CVT	1.5L I4 16V GDI DOHC Turbo	1HGCV1F49MA038035	54237	10598
4	2020	Lexus	RX 350	Used	$49,000	4.8	76	Dealer	Lexus of Henderson	4.8	...	Birch	Front-wheel Drive	20	27	Gasoline	8-Speed Automatic	3.5L V6 24V PDI DOHC	2T2AZMAA8LC156270	HDT4181A	28137

	Year	ConsumerRating	ConsumerReviews	SellerRating	SellerReviews	ComfortRating	InteriorDesignRating	PerformanceRating	ValueForMoneyRating	ExteriorStylingRating	ReliabilityRating	MinMPG	MaxMPG	Mileage
count	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000	9379.000000
mean	2018.721719	4.702825	133.187014	4.412571	984.089988	4.771895	4.727391	4.696290	4.537083	4.782194	4.681746	22.755411	29.216548	37463.023350
std	2.221708	0.240795	154.985640	0.626258	1609.039864	0.217822	0.194391	0.253664	0.338098	0.171537	0.368161	14.812869	12.809783	24970.342569
min	2001.000000	2.500000	1.000000	1.000000	1.000000	3.000000	2.800000	1.000000	1.000000	3.000000	1.000000	0.000000	0.000000	121.000000
25%	2018.000000	4.700000	30.000000	4.300000	112.000000	4.700000	4.700000	4.600000	4.500000	4.700000	4.600000	18.000000	25.000000	18666.500000
50%	2019.000000	4.800000	75.000000	4.600000	542.000000	4.800000	4.800000	4.700000	4.600000	4.800000	4.800000	20.000000	27.000000	32907.000000
75%	2020.000000	4.800000	182.000000	4.800000	1272.000000	4.900000	4.800000	4.800000	4.700000	4.900000	4.900000	24.000000	31.000000	47698.000000
max	2022.000000	5.000000	817.000000	5.000000	27824.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	150.000000	133.000000	234114.000000

	Year	Price	ConsumerRating	ConsumerReviews	SellerRating	SellerReviews	ComfortRating	InteriorDesignRating	PerformanceRating	ValueForMoneyRating	...	State_TX	State_US-12	State_US-169	State_UT	State_VA	State_VT	State_WA	State_WI	State_WV	State_WY
0	2019	39998.0	4.6	45	3.3	3	4.7	4.6	4.6	4.4	...	0	0	0	0	0	0	0	0	0	0
1	2018	49985.0	4.8	817	4.8	131	4.9	4.8	4.8	4.6	...	0	0	0	0	0	0	0	0	0	0
2	2017	41860.0	4.7	495	4.6	249	4.8	4.7	4.8	4.6	...	0	0	0	0	0	0	0	0	0	0
3	2021	28500.0	5.0	36	4.6	284	4.9	5.0	4.9	5.0	...	0	0	0	0	0	0	0	0	0	0
4	2020	49000.0	4.8	76	4.8	4755	4.9	4.8	4.8	4.7	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9374	2019	27374.0	4.7	205	4.4	443	4.7	4.7	4.6	4.7	...	0	0	0	0	0	0	0	0	0	0
9375	2019	61998.0	4.8	27	4.8	1789	4.9	4.8	4.8	4.6	...	0	0	0	0	0	0	0	0	0	0
9376	2017	26944.0	4.8	137	4.7	831	4.9	4.8	4.7	4.6	...	0	0	0	0	0	0	0	0	0	0
9377	2019	28568.0	4.7	279	4.4	680	4.8	4.7	4.6	4.7	...	0	0	0	0	0	0	0	0	0	0
9378	2019	32091.0	4.8	204	4.4	1105	4.9	4.9	4.6	4.8	...	0	0	0	0	0	0	0	0	0	0